require 'rubygems'
require_gem 'hpricot'

# Wraps an HTML file on disk and exposes the words found in its text.
#
# Markup is separated from text with the Hpricot parser.
class HTMLContent
  # Pattern a token must match to be reported by #extract_words: an optional
  # leading sign followed by at least three word/hyphen characters.
  WORD_PATTERN = /[-+]?\w[\-\w]{2,}/

  # identifire:: path of the HTML file; it is what #extract_words opens.
  #              (The spelling is part of the public interface.)
  # mtime::      value handed to the constructor, stored unchanged.
  attr_reader :identifire, :mtime

  # identifire:: path of the HTML file to read later on
  # mtime::      stored as given and exposed through the reader
  def initialize(identifire, mtime)
    @identifire = identifire
    @mtime = mtime
  end

  # Reads the file, drops the markup, lowercases the remaining text and
  # yields every token matching WORD_PATTERN, in document order.
  #
  # With a block, returns the lowercased text that was scanned (the value
  # String#scan returns). Without a block, returns an Enumerator over the
  # same words instead of failing on the first yield.
  #
  # Errors raised while opening or reading the file propagate to the caller.
  def extract_words
    return enum_for(:extract_words) unless block_given?

    # strip_tags only emits space-separated \w+ tokens, so the sign and
    # hyphen parts of WORD_PATTERN cannot match here; in effect this keeps
    # tokens of three or more word characters.
    text = strip_tags(File.read(@identifire)).downcase
    text.scan(WORD_PATTERN) { |word| yield word }
  end

  private

  # Returns the text of +content+ reduced to its \w+ tokens, each one
  # preceded by a single space. Anything that is not a word character
  # (punctuation, hyphens, whitespace runs) is discarded.
  def strip_tags(content)
    stripped = String.new

    Hpricot(content).traverse_text do |node|
      node.to_s.scan(/\w+/) { |token| stripped << " #{token}" }
    end

    stripped
  end
end
